import pandas as pd
import numpy as np

# movie_genre.py generates the table movie_genre.csv: it extracts each movie together with its genres; see the table itself for details.

# Load the source table, drop the rows whose "genres" value is NaN,
# and renumber the remaining rows from 0.
summer_movies = (
    pd.read_csv(r"DSProject\csv\summer_movies.csv")
    .dropna(subset=["genres"])
    .reset_index(drop=True)
)

# Flatten every comma-separated "genres" entry into one list of genre names.
genres = summer_movies["genres"].to_list()
all_genres = [name for entry in genres for name in str(entry).split(",")]

# Distinct genre names in sorted order; the indicator columns follow this order.
unique_genres = sorted(set(all_genres))

# One indicator column name per genre, built by prefixing the name with "is_".
genres_dummy = ["is_" + name for name in unique_genres]

# Keep the two source columns and append the indicator columns, all NaN for
# now; they are filled in further down the script.
movie_genre = summer_movies[["tconst", "genres"]].copy()
movie_genre = movie_genre.assign(**dict.fromkeys(genres_dummy, np.nan))


def extract_genres(genres):
    """Split a comma-separated genre value into a list of genre names.

    The value is converted with ``str()`` first, so non-string input is split
    on its string form. Pieces are returned exactly as they appear between
    the commas; no whitespace is stripped.
    """
    return str(genres).split(",")


# Replace each comma-separated "genres" string with the list of its genre names.
movie_genre["genres"] = movie_genre["genres"].apply(extract_genres)

# Fill every "is_<genre>" indicator column: 1 when the movie's genre list
# contains that genre, 0 otherwise.
#
# The columns are addressed by name and written one whole column at a time.
# This covers every row (Series.count() skips NaN, so bounding a row loop by
# movie_genre["tconst"].count() would leave trailing rows unfilled whenever a
# "tconst" is missing) and avoids one scalar .iloc write per cell.
# The values are stored as floats so the CSV keeps the 1.0 / 0.0 formatting
# that results from writing into NaN-initialised columns.
genre_lists = movie_genre["genres"].to_list()
for genre in unique_genres:
    movie_genre["is_" + genre] = np.array(
        [genre in names for names in genre_lists], dtype=float
    )

movie_genre.to_csv(r"DSProject\csv\movie_genre.csv", index=False)
